*-----------------------------------------------------------------
version 11
cap claer mata
cap clear matrix
cap clear
cap log close
set more off
cd "E:\REStat_MS14767_Vol96(2)\Data preparation Compustat segment"
log using "9_match_cq.log", replace
*-----------------------------------------------------------------

*****************************************
* starting data2.dta to master_compustat_all_countries.dta
*
* This file reconstructs the connection between match.dta and compustat.dta
* so that the data from compustat dataset may later be merged with patent data.
* It is equivalent to the patents_match.do file.
* Match.dta contains 5000 entries.
**************************************************

set mem 600m

use "compustat+segment.dta", clear
keep ticker cusip name dataset
so ticker
drop if ticker==ticker[_n-1]



****************************************
* In order to find matches within match.dta, 9-digit cusip is changed to 6-digit cusip
* and renamed. cusip9 is the original cusip.
************************************

rename cusip cusip9
gen cusip= substr(cusip9, 1, length(cusip9) - 3)

label var cusip "6-digit cusip from compustat"
label var cusip9 "9-digit cusip from compustat" 

*egen dataset2 = group(dataset)



sort cusip
egen comnum = group(ticker)
egen x = count(comnum), by(cusip)


save "patent files\segment_master_red.dta", replace
*egen x2 = seq(), by(cusip)
*reshape wide ticker name, i(cusip) j(x2)

*save "patent files\segment_master_red_wide.dta", replace




***************************
* match.dta has cusip as identifier, contains 5,000 entries compared to 
* 23,000 entries in compustat
******************************

use "patent files\patents\match.dta", clear


joinby cusip using "patent files\segment_master_red.dta", unmatched(both) 

tab _merge

******************
*------------------------+-----------------------------------
*       master only (1) |      1,829        8.53        8.53
*         using only (2) |     16,542       77.13       85.65
*            matched (3) |      3,077       14.35      100.00
*------------------------+-----------------------------------
*                  Total |     21,448      100.00
*
**************************************

drop if _merge==2
br if x>1 &x !=. & cname !=""

*Here some manual adjustment are needed,:
* for the merge we used cusip6  which does not uniquely identify 
* the Firm:

*cusip	ticker	name						cname
*372917	GZSP	GENZYME SURGICAL PRODUCTS	GENZYME CORP
*372917	GZBX	GENZYME BIOSURGERY			GENZYME CORP
*372917	GZTR	GENZYME TISSUE REPAIR		GENZYME CORP
*372917	GENZ	GENZYME CORP				GENZYME CORP
*372917	GZMO	GENZYME MOLECULAR ONCOLOGY	GENZYME CORP
*373298	TGP.1	GEORGIA-PACIFIC TIMBER CO	GEORGIA-PACIFIC CORP
*373298	GP.1	GEORGIA-PACIFIC CORP		GEORGIA-PACIFIC CORP
*747906	HDD	QUANTUM CORP HDDG				QUANTUM CORP
*747906	QTM	QUANTUM CORP					QUANTUM CORP
*747906	HDD	QUANTUM CORP HDDG				QUANTUM CORP
*747906	QTM	QUANTUM CORP					QUANTUM CORP
*254687	DIG.3	DISNEY (WALT) INTERNET GROUP	DISNEY (WALT) COMPANY
*254687	DIS	DISNEY (WALT) CO					DISNEY (WALT) COMPANY
*254687	DIS	DISNEY (WALT) CO					DISNEY (WALT) COMPANY
*254687	DIG.3	DISNEY (WALT) INTERNET GROUP	DISNEY (WALT) COMPANY
*373298	TGP.1	GEORGIA-PACIFIC TIMBER CO		GEORGIA-PACIFIC CORP
*373298	GP.1	GEORGIA-PACIFIC CORP			GEORGIA-PACIFIC CORP


* I took those where the cname is equal or almost equal to the name variable:

drop if ticker == "GZSP"|ticker =="GZBX"|ticker=="GZTR"|ticker == "GZMO"
drop if ticker =="TGP.1"
drop if ticker == "HDD"
drop if ticker == "DIG.3"


gen cusip_match=0
replace cusip_match=1 if _merge==3
drop _merge x comnum


gen cusip_name=name
replace name=cname
rename cusip9 cusip_cusip9



*label var cname "name in compustat data set"
label var cusip "unique identifier"

label var assignee "assignee identi in compustat data set"
label var pname "name of parent firm"
label var sname "name of subsidary firm"
label var assname "patent assignee name "
label var cusip_cusip9 "9-digit cusip number"
label var cusip_name "original compustat+segment name"
label var cusip_match "valid cusip connection between match and compustat"

desc

save "patent files\cusipmerged.dta", replace
clear

*****************************
* This was merging cusip, now comes merging names with match.dta (now cusipmerged.dta)
*******************************

use "patent files\segment_master_red.dta", clear
drop cusip comnum x
save "patent files\segment_master_red.dta", replace


use "patent files\cusipmerged.dta", clear
joinby name using "patent files\segment_master_red.dta", unmatched(both) 

tab _merge
drop if _merge==2
gen name_match=0
replace name_match=1 if _merge==3
drop _merge
rename cusip9 name_cusip9
count if cusip_match==0 & name_match==0

label var name_match "valid name connection between match and compustat"

save "patent files\cusipnamemerged.dta", replace

count if name_match==1
count if cusip_match==1

****************************
* Now the dataset contains 3003 name-matches and 3231 cusip-matches, 1589 non-matched***
**********************************

insheet using "patent files\manual_matching_cq_ip.csv", clear
save "patent files\manual_matching_cq_ip.dta", replace
drop in 1/1
rename v1 manual_name
rename v2 manual_cusip9
rename v3 cname
rename v4 cusip
rename v5 error
so manual_name
save "patent files\manual_matching_cq.dta", replace

use "patent files\segment_master_red.dta", clear

keep name  cusip9
rename name manual_name
rename cusip9 manual_cusip9
joinby manual_name using "patent files\manual_matching_cq.dta", unmatched(both) 
tab _merge
keep if _merge==3
sort manual_name
keep manual_name manual_cusip9 cname

save "patent files\manual_matching_cq.dta", replace

use "patent files\cusipnamemerged.dta", clear

joinby cname using "patent files\manual_matching_cq.dta", unmatched(both) 
gen manual=0
replace manual=1 if _merge==3 & manual_name~="n"
tab _merge
drop _merge

replace cname=manual_name if manual==1
drop manual_name
replace cusip=manual_cusip9 if manual==1
drop manual_cusip
replace cname=cusip_name if cusip_match==1 & name_match==0
drop cusip_name
replace cusip=name_cusip9 if cusip_match==0 & name_match==1
drop name_cusip9
replace cusip=cusip_cusip9 if cusip_match==1
drop cusip_cusip9 

**********************************
* The following steps prepare and examine the merge between compustat and cusipnamemerged
* in order to append assignee numbers and names to compustat for later merge with patents
**********************************

so assignee
drop if assignee==assignee[_n-1] & cusip == cusip[_n-1]
drop if cusip_match==0 & name_match==0 & manual==0

save "patent files\cusipnamemerged.dta", replace

*******************************************************


use "patent files\segment_master_red.dta", clear
rename cusip9 cusip


merge 1:m cusip using "patent files\cusipnamemerged.dta"
keep if _merge==3
tab _merge
drop _merge

so cusip
count if cusip!=cusip[_n-1]

*************************************************************
* Here it is checked for non-american entries, 
* those are dropped.
******************************************************



save "patent files\cusipnamemerged.dta", replace

keep assignee cusip ticker 
so assignee
desc
save "patent files\cusipnamemerged_red.dta", replace

desc

log close


